import pandas as pd
import altair as alt
train_data = pd.read_csv('train.csv', index_col=0)
train_data
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | |||||||||||||||||||||
| 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
1460 rows × 80 columns
X_train = train_data.drop(columns='SalePrice')
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1460 entries, 1 to 1460 Data columns (total 79 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 1460 non-null int64 1 MSZoning 1460 non-null object 2 LotFrontage 1201 non-null float64 3 LotArea 1460 non-null int64 4 Street 1460 non-null object 5 Alley 91 non-null object 6 LotShape 1460 non-null object 7 LandContour 1460 non-null object 8 Utilities 1460 non-null object 9 LotConfig 1460 non-null object 10 LandSlope 1460 non-null object 11 Neighborhood 1460 non-null object 12 Condition1 1460 non-null object 13 Condition2 1460 non-null object 14 BldgType 1460 non-null object 15 HouseStyle 1460 non-null object 16 OverallQual 1460 non-null int64 17 OverallCond 1460 non-null int64 18 YearBuilt 1460 non-null int64 19 YearRemodAdd 1460 non-null int64 20 RoofStyle 1460 non-null object 21 RoofMatl 1460 non-null object 22 Exterior1st 1460 non-null object 23 Exterior2nd 1460 non-null object 24 MasVnrType 1452 non-null object 25 MasVnrArea 1452 non-null float64 26 ExterQual 1460 non-null object 27 ExterCond 1460 non-null object 28 Foundation 1460 non-null object 29 BsmtQual 1423 non-null object 30 BsmtCond 1423 non-null object 31 BsmtExposure 1422 non-null object 32 BsmtFinType1 1423 non-null object 33 BsmtFinSF1 1460 non-null int64 34 BsmtFinType2 1422 non-null object 35 BsmtFinSF2 1460 non-null int64 36 BsmtUnfSF 1460 non-null int64 37 TotalBsmtSF 1460 non-null int64 38 Heating 1460 non-null object 39 HeatingQC 1460 non-null object 40 CentralAir 1460 non-null object 41 Electrical 1459 non-null object 42 1stFlrSF 1460 non-null int64 43 2ndFlrSF 1460 non-null int64 44 LowQualFinSF 1460 non-null int64 45 GrLivArea 1460 non-null int64 46 BsmtFullBath 1460 non-null int64 47 BsmtHalfBath 1460 non-null int64 48 FullBath 1460 non-null int64 49 HalfBath 1460 non-null int64 50 BedroomAbvGr 1460 non-null int64 51 KitchenAbvGr 1460 non-null int64 52 KitchenQual 1460 non-null object 53 TotRmsAbvGrd 1460 non-null int64 54 Functional 1460 non-null object 55 Fireplaces 1460 non-null int64 56 FireplaceQu 770 non-null object 57 GarageType 1379 non-null object 58 GarageYrBlt 1379 non-null float64 59 GarageFinish 1379 non-null object 60 GarageCars 1460 non-null int64 61 GarageArea 1460 non-null int64 62 GarageQual 1379 non-null object 63 GarageCond 1379 non-null object 64 PavedDrive 1460 non-null object 65 WoodDeckSF 1460 non-null int64 66 OpenPorchSF 1460 non-null int64 67 EnclosedPorch 1460 non-null int64 68 3SsnPorch 1460 non-null int64 69 ScreenPorch 1460 non-null int64 70 PoolArea 1460 non-null int64 71 PoolQC 7 non-null object 72 Fence 281 non-null object 73 MiscFeature 54 non-null object 74 MiscVal 1460 non-null int64 75 MoSold 1460 non-null int64 76 YrSold 1460 non-null int64 77 SaleType 1460 non-null object 78 SaleCondition 1460 non-null object dtypes: float64(3), int64(33), object(43) memory usage: 912.5+ KB
cat_cols = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
'RoofMat1', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
'Heating', 'Electrical', 'Fireplaces', 'GarageType', 'GarageFinish',
'PavedDrive', 'MiscFeature']
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
'BsmtFinSF1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt','GarageCars',
'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', ]
ord_cols = ['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual',
'BsmtCond','BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC'
'KitchenQual', 'Functional', 'FireplaceQu', 'GarageQual', 'GarageCond',
'PoolQc', 'Fence', ]
bin_cols = ['CentralAir']
corr_df = (
X_train
.select_dtypes('number')
.corr('spearman')
.abs() # Use abs for negative correlation to stand out
.stack() # Get df into long format for altair
.reset_index(name='corr')) # Name the index that is reset to avoid name collision
alt.Chart(corr_df).mark_circle().encode(
x='level_0',
y='level_1',
size='corr',
color='corr')
scatter_plot = alt.Chart(train_data).mark_point(size = 0.1).encode(
alt.Y(alt.repeat('row'), type='quantitative'),
alt.X(alt.repeat('column'), type='quantitative')
).properties(height = 100, width = 100
)
scatter_plot.repeat(column=num_cols[0:6], row=['SalePrice'])
scatter_plot.repeat(column=num_cols[6:12], row=['SalePrice'])
scatter_plot.repeat(column=num_cols[12:18], row=['SalePrice'])
scatter_plot.repeat(column=num_cols[18:24], row=['SalePrice'])
scatter_plot.repeat(column=num_cols[24:32], row=['SalePrice'])
alt.Chart(train_data).mark_point(size = 0.1).encode(
alt.Y(alt.repeat('row'), type='ordinal'),
alt.X(alt.repeat('column'), type='ordinal')
).properties(height = 100, width = 100
).repeat(column=ord_cols, row=['SalePrice'])